In this article we're going to run some experiments with ensemble classifiers — bagged trees, random forests, and boosted trees — on the Titanic dataset, and we'll add a bonus on another form of classification at the end.
Bagging, random forests, and boosting use simple decision trees as building blocks to construct more powerful prediction models. By aggregating many decision trees with methods like bagging, random forests, and boosting, the predictive performance of trees can be substantially improved. We are going to cover how to fit these models using Python.
# import the necessary tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# FIX: sklearn.externals.six was deprecated in scikit-learn 0.21 and removed
# in 0.23; the standard-library io.StringIO is the drop-in replacement.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz

# read and display the dataset
df = pd.read_csv('titanic.csv')
df.head()
df.info()  # investigate the data: dtypes and non-null counts per column
Overall the dataframe looks clean except for two columns, Age and Cabin — especially Cabin.
The Cabin column is missing so many values that our model cannot learn much from it, so we're going to drop that column,
and for simplicity we will also drop the rows with missing data in the Age column.
# Cabin is mostly empty, so it is removed outright; then every row that
# still has any missing value (mainly Age) is dropped for simplicity.
df.drop(columns="Cabin", inplace=True)
df.dropna(inplace=True)  # how='any' is the default
df.info()
# categorical columns used for pairwise comparison charts
cols = ['Survived', 'Pclass', 'Sex', 'Embarked']
plot_slot = 0  # running subplot position on the shared canvas
plt.figure(figsize=(20, 20))  # one large canvas for all twelve bar charts
# every ordered pair of distinct categorical columns gets its own chart:
# the first column groups the rows, the second is counted within each group
for grouper in cols:
    for other in cols:
        if grouper == other:
            continue  # skip comparing a column with itself
        plot_slot += 1  # advance to the next subplot before drawing
        ax = plt.subplot(4, 3, plot_slot)
        # normalized value counts per group; unstack so the bars for each
        # category sit side by side instead of stacking on top of each other
        df.groupby(grouper)[other].value_counts(normalize=True).unstack().plot(kind='bar', ax=ax)
        plt.title(grouper + ' VS ' + other)  # label the pair being compared
The figure above sheds some light on what determined a passenger's survival — please appreciate the power of Python for generating so much information with so few lines of code.
# work on a copy so that the transformations below do not touch the
# original dataframe
new_df = df.copy()
new_df.head()
# convert Sex and Embarked to integer codes
for col in ['Sex', 'Embarked']:
    code = 0  # next integer to assign to a category value
    print(col)  # keep track of which column is being encoded
    # walk the unique values in sorted order so the mapping is stable
    # rather than depending on row order
    for value in new_df[col].sort_values().unique():
        # select the rows holding this value and replace them with the code
        new_df.loc[new_df[col] == value, col] = code
        print(value, ' = ', code)  # show the mapping that was applied
        code += 1  # bump the code for the next category

# check the transformed frame
new_df.head()
new_df.isnull().sum()
# columns treated as independent variables (features)
independent = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# feature matrix x and target vector y, Survived being the dependent variable
x = new_df[independent].values
y = new_df['Survived'].values
print(y.shape, x.shape)
# import the splitting helper from sklearn
from sklearn.model_selection import train_test_split

# 60% training data; the remaining 40% is then halved into a dev set and a
# test set (roughly 20% each)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=1)
x_dev, x_test, y_dev, y_test = train_test_split(x_test, y_test, test_size=0.50, random_state=1)
# sanity-check the resulting split ratios... not perfect but not too bad
(x_dev.size / x.size), (x_test.size / x.size), (x_train.size / x.size)
# import the decision-tree classifier
from sklearn.tree import DecisionTreeClassifier

# BUG FIX: the original built DecisionTreeClassifier(random_state=1) on its
# own line and threw the instance away, then fitted a brand-new classifier
# WITHOUT the seed - so the results were not reproducible. Seed the instance
# that is actually fitted.
decision_model = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)

# predict on the dev set
predict1 = decision_model.predict(x_dev)

# confusion matrix of the dev-set predictions, just to see the error pattern
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_dev, predict1)
sns.heatmap(cm)  # plot the matrix
cm
# plot the decision tree
# FIX: the original created dot_data = StringIO() and never used it -
# export_graphviz writes straight to the .dot file, so the buffer was dead code.
export_graphviz(decision_model, out_file='dot_data.dot',
                feature_names=independent, filled=True)
# render the .dot file to a PNG with the graphviz 'dot' binary
from subprocess import call
call(['dot', '-Tpng', 'dot_data.dot', '-o', 'dot_data.png', '-Gdpi=600'])
from IPython.display import Image
Image(filename='dot_data.png')

# accuracy of the unpruned tree on the development set
decision1 = decision_model.score(x_dev, y_dev)
decision1
# ~0.74 is not good enough, but for a small demo model we can accommodate it
# The original repeated the same build/score/render cell five times (depths
# 2, 4, 6, 8, 10). Each copy had two defects: it constructed a seeded
# DecisionTreeClassifier(random_state=1) and discarded it, fitting an
# UNSEEDED tree instead, and it passed precision=True (a bool) where
# export_graphviz expects an int number of digits. A single helper plus a
# loop fixes both and removes the duplication.

def _fit_score_render(depth):
    """Fit a seeded tree with the given max_depth, score it on the dev set,
    and render it to dot_data.png. Returns (model, dev_accuracy)."""
    model = DecisionTreeClassifier(max_depth=depth, random_state=1).fit(x_train, y_train)
    score = model.score(x_dev, y_dev)
    export_graphviz(model, out_file='dot_data.dot',
                    feature_names=independent, filled=True, precision=2)
    call(['dot', '-Tpng', 'dot_data.dot', '-o', 'dot_data.png', '-Gdpi=600'])
    return model, score

for tuned_depth in (2, 4, 6, 8, 10):
    decision_model, decision2 = _fit_score_render(tuned_depth)
    print('max_depth =', tuned_depth, 'dev accuracy =', decision2)

# as in the original notebook, decision_model/decision2 now hold the
# depth-10 results, and the last rendered tree is displayed
decision2
Image(filename='dot_data.png')
depth = [None, 2, 4, 6, 8, 10]  # candidate max_depth values (None = unpruned)
# pair each evaluation set with its label; the original zipped parallel lists
# into loop variables named x and y, shadowing the global feature/target
# arrays - renamed to avoid that.
eval_sets = [(x_train, y_train, 'Training Accuracies'),
             (x_dev, y_dev, 'Development Accuracies')]
# these two lists collect the accuracy per depth for the plot below
train_list = []
dev_list = []

for x_eval, y_eval, label in eval_sets:
    print('\n', label, '\n')  # heading for this evaluation set
    for d in depth:
        # BUG FIX: the original instantiated DecisionTreeClassifier(random_state=1)
        # and discarded it, then fitted an UNSEEDED tree - despite the comment
        # claiming the randomness was stabilised. Seed the fitted model.
        decision_model = DecisionTreeClassifier(max_depth=d, random_state=1).fit(x_train, y_train)
        decision = decision_model.score(x_eval, y_eval)
        print('Depth:', d, '\n', 'Accuracy:', decision)
        # route the score into the list matching the set being evaluated
        if label == 'Training Accuracies':
            train_list.append(decision)
        else:
            dev_list.append(decision)

# visualise both accuracy curves to choose a depth
plt.figure(figsize=(10, 8))
# FIX: the original plotted against bare indices 0..5 while the title
# promised depths; label the ticks with the actual depth values
positions = range(len(depth))
plt.plot(positions, dev_list, color='r', marker='o', label='dev accuracy')
plt.plot(positions, train_list, color='g', marker='o', label=' train accuracy')
plt.xticks(positions, [str(d) for d in depth])
plt.title('DECISION ACCURACY OF DEV AND TRAIN \n AT MAX_DEPTH RANGE FROM 0-10')
plt.xlabel('max-depth')
plt.ylabel('ACCURACY')
plt.legend()
Write down what shape the lines have and what this shape means. ● Pick an optimum value for the max_depth parameter. ● Report the accuracy of your final model on the test data.
Based on the curves above, I think max_depth = 8 is the best choice.
# final tree at the chosen max_depth of 8
# BUG FIX: seed the tree that is actually fitted - the original created a
# seeded instance, discarded it, and trained an unseeded one.
decision_model = DecisionTreeClassifier(max_depth=8, random_state=1).fit(x_train, y_train)
# accuracy on the held-out test set
decision = decision_model.score(x_test, y_test)
decision
# ET VOILA
● In this task, we will continue with the titanic.ipynb notebook created in the previous task.
● Create a bagged, random forest, and boosted tree for the Titanic dataset in the same way that you created a regular classification tree.
● Pick one of these methods, and tune the parameters n_estimators and max_depth.
● Report the accuracy of all models and state which model performed best, including the values for n_estimators and max_depth that the best model used.
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics

# bagged ensemble of 100 trees, seeded for reproducibility; fit returns the
# estimator itself, so building and training collapse into one statement
model = BaggingClassifier(n_estimators=100, random_state=7).fit(x_train, y_train)

# dev-set predictions and accuracy
y_pred_ = model.predict(x_dev)
bagging_score = metrics.accuracy_score(y_dev, y_pred_)
# how often is the classifier correct?
print("Accuracy:", bagging_score)
# Import the random forest model
from sklearn.ensemble import RandomForestClassifier

# CONSISTENCY FIX: every other model in this notebook is seeded; without a
# random_state the forest reports a different accuracy on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=7)
# train on the training split
clf.fit(x_train, y_train)
y_pred = clf.predict(x_dev)
# dev-set accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_dev, y_pred))
# boosting models that we're going to compare on this dataset
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.model_selection import cross_val_score

# vanilla (default-parameter) boosters
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = xgb.XGBClassifier()
# a hard-voting ensemble of the three boosters
eclf = EnsembleVoteClassifier(clfs=[ada_boost, grad_boost, xgb_boost], voting='hard')

# readable names for the report
labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']
# 10-fold cross-validated accuracy of each model on the training set
for estimator, name in zip([ada_boost, grad_boost, xgb_boost, eclf], labels):
    scores = cross_val_score(estimator, x_train, y_train, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), name))
Because the gradient boosting classifier performs well, we're going to tweak its parameters and see if we can get an even better result from it on the dev set.
# re-create the gradient booster with a deeper tree and n_estimators of 150
grad_boost = GradientBoostingClassifier(n_estimators=150, max_depth=10)
clf = grad_boost.fit(x_train, y_train)  # fit returns the fitted estimator
grad_predict = grad_boost.predict(x_dev)  # predictions on the dev set
grad_boost_score = metrics.accuracy_score(y_dev, grad_predict)  # dev accuracy
# report the result
print("Accuracy:", grad_boost_score)
# start by displaying the dev-set scores of our two finalist models
print('bagging score:', bagging_score, '\n Grad_boost score:', grad_boost_score)

# final prediction of the gradient booster on the untouched test set
grad_predict_ = grad_boost.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, grad_predict_))

# final prediction of the bagging model on the test set
y_bagge_predict = model.predict(x_test)
metrics.accuracy_score(y_test, y_bagge_predict)
As a bonus for this article, we're going to have a look at how we can implement a simple decision tree using Python (of course).
# FIX: sklearn.externals.six was removed in scikit-learn 0.23; the
# standard-library io.StringIO is the replacement.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import numpy as np
import matplotlib.pyplot as plt
# We are going to use the Iris data set
from sklearn.datasets import load_iris
# import the decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Load data
iris = load_iris()
# use all four features here (the original comment wrongly said "two";
# the pair-wise version appears in viziris further below)
X = iris.data
y = iris.target
## Below is a sample of the parameters that can be set when fitting a tree.
## We will not go in depth explaining each of them, but you can do more
## research on how to optimize your tree. Here is the link:
## https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
## FIX: the original example also passed presort=False, but that parameter was
## deprecated in scikit-learn 0.22 and removed in 0.24, so the call would now
## raise TypeError; it has been dropped.
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=10, min_samples_leaf=5,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=None, splitter='random')

# Training the model on the full iris data
clf = DecisionTreeClassifier(max_depth=9).fit(X, y)
predict1 = clf.predict(X)

# confusion matrix of the (in-sample) predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y, predict1)
print(cm)
We are going to visualize the tree using graphviz We will need to install the package. You can do this by simply typing pip3 install graphviz
If you are more conversant with brew install then you can type: brew install graphviz
If you have a latest version of jupyter and python installed on a windows machine.
We will need to install graphviz tool in our system and set the path in environment variables. Visit http://www.graphviz.org/Download..php and find the optimal version for the computer. Get the path for gvedit.exe in install directory(for me it was “C:\Program Files (x86)\Graphviz2.38\bin\”)
goto start->computer->system properties->advanced settings->environment variables and add the path.
We will need python package pydotplus(for older python versions pydot) use this command in your anaconda prompt:
conda install -c conda-forge pydotplus or pip3 install pydotplus
feature_cols = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
# FIX: the original allocated dot_data = StringIO() but never used it -
# export_graphviz below writes directly to the out_file.
fit = clf.fit(X, y)  # refit on the same data (clf was already trained above)
export_graphviz(fit, out_file='dot_data.dot', feature_names=feature_cols,
                rounded=True, proportion=False, precision=2, filled=True)
# Convert to png using a system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'dot_data.dot', '-o', 'dot_data.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename='dot_data.png')
We're also going to see how we can draw a decision tree's decision surface on a scatter plot, in case you could not get Graphviz installed.
# helper that draws a decision surface for every pair of iris features
def viziris(iris):
    """Fit a decision tree on each pair of iris features and draw its
    decision surface together with the training points, one subplot per pair."""
    for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                    [1, 2], [1, 3], [2, 3]]):
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target
        # Train an unconstrained tree on this feature pair
        clf = DecisionTreeClassifier().fit(X, y)
        # Plot the decision boundary over a grid covering the data
        plt.subplot(2, 3, pairidx + 1)
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                             np.arange(y_min, y_max, 0.02))
        plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
        plt.xlabel(iris.feature_names[pair[0]])
        plt.ylabel(iris.feature_names[pair[1]])
        # Plot the training points, one colour per class
        # FIX: the original also passed cmap= to scatter, which matplotlib
        # ignores (with a warning) when c is a single colour string.
        for i, color in zip(range(3), "ryb"):
            idx = np.where(y == i)
            plt.scatter(X[idx, 0], X[idx, 1], c=color,
                        label=iris.target_names[i], edgecolor='black', s=25)
    plt.suptitle("Decision surface of a decision tree using paired features")
    plt.legend(loc='lower right', borderpad=0, handletextpad=0)
    plt.axis("tight")
    plt.show()

viziris(iris)